In [ ]:
from __future__ import print_function

import numpy as np

from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)

The data

The 20 newsgroups dataset comprises around 18,000 newsgroup posts on 20 topics, split into two subsets: one for training (or development) and the other for testing (or performance evaluation). The split between the train and test sets is based on messages posted before and after a specific date.


In [ ]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

twenty_train = fetch_20newsgroups(subset='train',
                 remove=('headers', 'footers', 'quotes'),
                 categories=categories, shuffle=True, random_state=42)

twenty_train.target_names

In [ ]:
# Sample data
print(twenty_train.data[0])
print('---------------')
print('Target: ', twenty_train.target[0])

TASK 1: Create a document-term matrix


In [ ]:
# Text preprocessing, tokenizing and filtering of stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Define the count vectorizer here
...

# Fit the count vectorizer on the training texts and transform them into counts
X_train_counts = ...

X_train_counts.shape
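
A possible way to fill in the blanks above (a sketch, not the only valid answer; the stop-word filtering is an illustrative choice):

In [ ]:
# Possible solution: build the vectorizer, then learn the vocabulary and
# produce the document-term matrix in a single fit_transform call
count_vect = CountVectorizer(stop_words='english')
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape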

In [ ]:
# Visualize the results, first row and first column
print(X_train_counts[0,:])
print(X_train_counts[:,0])

TASK 2: TF-IDF transformation


In [ ]:
# From occurrences to frequencies
from sklearn.feature_extraction.text import TfidfTransformer

# Define the TF-IDF transformer here and fit it
tfidf_transformer = ...

# Apply the transformer to the training document-term matrix
X_train_tf = ...

X_train_tf.shape
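
Tf-idf rescales raw counts by term frequency and inverse document frequency, so words that occur in almost every document contribute less. A possible solution sketch:

In [ ]:
# Possible solution: fit the transformer on the training counts,
# then rescale those same counts to tf-idf weights
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tf = tfidf_transformer.transform(X_train_counts)
X_train_tf.shape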

In [ ]:
# Visualize the results, first row and first column
print(X_train_tf[0,:])
print(X_train_tf[:,0])

First basic model


In [ ]:
from sklearn.naive_bayes import MultinomialNB

# Define and fit in one line
clf = MultinomialNB().fit(X_train_tf, twenty_train.target)
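
As a quick sanity check, the fitted model can classify a couple of made-up snippets. This assumes the vectorizer and transformer from Tasks 1 and 2 are bound to count_vect and tfidf_transformer, as in the sketches above:

In [ ]:
# Classify two new documents (count_vect and tfidf_transformer are the
# assumed names of the objects built in Tasks 1 and 2)
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_tf = tfidf_transformer.transform(count_vect.transform(docs_new))

for doc, category in zip(docs_new, clf.predict(X_new_tf)):
    print('%r => %s' % (doc, twenty_train.target_names[category]))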

TASK 3: Define a scoring process


In [ ]:
# Score the test data

# Read test data
twenty_test = fetch_20newsgroups(subset='test',
                 remove=('headers', 'footers', 'quotes'),
                 categories=categories, shuffle=True, random_state=42)

# Transform text to counts
X_test_counts = ...

# tf-idf transformation
X_test_tf = ...

# Prediction
predicted = ...

# Accuracy
from sklearn.metrics import accuracy_score
print('Accuracy test: ', accuracy_score(twenty_test.target, predicted))
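
A possible way to complete the cell above. The key point is to transform the test texts with the objects already fitted on the training set, never to refit them (again assuming the count_vect / tfidf_transformer names from the earlier sketches):

In [ ]:
# Possible solution: transform only, reusing the fitted objects
X_test_counts = count_vect.transform(twenty_test.data)
X_test_tf = tfidf_transformer.transform(X_test_counts)
predicted = clf.predict(X_test_tf)

print('Accuracy test: ', accuracy_score(twenty_test.target, predicted))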

TASK 4: Build a pipeline


In [ ]:
# Define the pipeline

from sklearn.pipeline import Pipeline

text_clf = Pipeline(...)

# Fit the whole pipeline
text_clf.fit(twenty_train.data, twenty_train.target)
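
One possible pipeline definition (the step names are a choice, but they matter later because hyper-parameters are addressed as step__param):

In [ ]:
# Possible solution: chain vectorizer, tf-idf and classifier in one estimator
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])
text_clf.fit(twenty_train.data, twenty_train.target)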

In [ ]:
# Evaluate on the test data
twenty_test = fetch_20newsgroups(subset='test',
                    remove=('headers', 'footers', 'quotes'),
                    categories=categories, 
                    shuffle=True, random_state=42)

predicted = text_clf.predict(twenty_test.data)

np.mean(predicted == twenty_test.target)

Change the classifier in the pipeline


In [ ]:
from sklearn.linear_model import SGDClassifier

text_clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2,
                                              max_features=5000,
                                              stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, max_iter=5, tol=None,
                                           random_state=42)),
                    ])
# Fit
_ = text_clf.fit(twenty_train.data, twenty_train.target)

# Predict
predicted = text_clf.predict(twenty_test.data)

# Evaluate accuracy
np.mean(predicted == twenty_test.target)

Another classifier


In [ ]:
from sklearn import svm
text_clf_svm = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2,
                                                  max_features=5000,
                                                  stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', svm.LinearSVC()),
                        ])

_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

predicted = text_clf_svm.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

TASK 5: Optimize a pipeline


In [ ]:
from sklearn.model_selection import RandomizedSearchCV

# Define the estimator here, without the parameters being searched over
clf = Pipeline([('vect', ...),
                ('tfidf', ...),
                ('clf', ...),
                ])

# Specify parameters and distributions to sample from.
# Parameters of pipeline steps are set using '__'-separated names:
param_dist = {"vect__max_features": ..., 
              "vect__stop_words": ..., 
              "clf__C": ...}

# Define randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(...)

# Run the randomized search
random_search.fit(twenty_train.data, twenty_train.target)

print("Done!")

In [ ]:
# Load the dictionary of search results into a pandas DataFrame
import pandas as pd

df_cv_results = pd.DataFrame.from_dict(random_search.cv_results_)
df_cv_results
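
The winning configuration and its cross-validated score can also be read directly from the fitted search object:

In [ ]:
# Best hyper-parameters found and the corresponding mean CV score
print(random_search.best_params_)
print(random_search.best_score_)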

In [ ]:
# Score & evaluate test data using the best estimator

text_clf_svm = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2,
                                                  max_features=10000,
                                                  stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', svm.LinearSVC(C=1.5)),
                        ])

_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

predicted = text_clf_svm.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

Additional metrics for multiclass classification


In [ ]:
from sklearn import metrics

print(metrics.classification_report(twenty_test.target, 
                                    predicted,
                                    target_names=twenty_test.target_names))

In [ ]:
metrics.confusion_matrix(twenty_test.target, predicted)
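
The raw matrix is easier to read with the category names attached. A minimal sketch reusing the pandas import from above (rows are true classes, columns are predictions):

In [ ]:
# Wrap the confusion matrix in a labeled DataFrame for readability
pd.DataFrame(metrics.confusion_matrix(twenty_test.target, predicted),
             index=twenty_test.target_names,
             columns=twenty_test.target_names)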